# Importing all the required libraries
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from pandas_profiling import ProfileReport
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from scipy import stats
from statsmodels.formula.api import ols
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.feature_selection import ExhaustiveFeatureSelector as efs
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, f_regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score
# Global seaborn styling used by every figure below.
sns.set_style('whitegrid');
sns.set_context('notebook', font_scale=1);
sns.set_palette("Dark2")
# IPython magic: render matplotlib figures inline (notebook-only syntax).
%matplotlib inline
# Importing the dataset.
# NOTE(review): presumably the King County (Seattle) house-sales data — confirm source.
df = pd.read_csv('kc_house_data.csv')
df.head()
df.columns
df.info();
The dataset has a total of 21613 rows.
# Distinct values per column (repeated 'id's indicate houses resold in the period).
df.nunique()
As we can observe, the id column has fewer unique values than the number of rows. This indicates that some of the houses were sold multiple times within 2014 and 2015.
# Transposed summary statistics for every numeric column.
df.describe().T
# Dropping the 'id' column
df.drop('id', axis=1, inplace=True)
# Convert to date object
df['date'] = pd.to_datetime(df['date'])
# Split the sale date into year/month features, then drop the raw date.
df['year'] = pd.DatetimeIndex(df['date']).year
df['month'] = pd.DatetimeIndex(df['date']).month
df.drop('date', axis=1, inplace=True)
# Age of the house at sale time (can be negative for pre-sold homes; fixed below).
df['age'] = df['year']-df['yr_built']
sns.boxplot(data = df, x = 'year', y = 'price');
# We can observe that there is not much variation in the prices of houses sold in 2014 and 2015
#profile.to_notebook_iframe()
sns.boxplot(data = df, x = 'month', y = 'price'); # Not much difference again, can be dropped
# Dropping the month feature
df.drop('month', axis =1, inplace = True)
def renovated_(x):
    """Map a renovation year to a binary flag: 0 if never renovated, else 1."""
    return 0 if x == 0 else 1
# Binary flag: 1 if the house was ever renovated, 0 otherwise.
df['renovated'] = df['yr_renovated'].apply(renovated_)
df.head()
# Recompute age and inspect the rows with impossible negative ages.
df['age'] = df['year']-df['yr_built']
df[df['age']<0]
def age_(x):
    """Clamp impossible negative house ages to 1.

    A negative age means the sale year precedes `yr_built` (a pre-sold
    home); treat any such house as one year old. The original version
    only handled the exact value -1 and let other negative ages through,
    even though the follow-up check expects no negatives to remain.
    """
    return 1 if x < 0 else x
# Replace the impossible negative ages, then confirm none remain.
df['age'] = df['age'].apply(age_)
df[df['age']<0]
df.head()
print(df.columns)
print(len(list(df.columns)))
# Columns having Numerical data are
numeric_features = df.select_dtypes(include=[np.number])
print(list(numeric_features.columns))
print(len(list(numeric_features.columns)))
# Checking for houses with either 0 bedrooms or 0 bathrooms
questionable1 = df[(df.bedrooms == 0) | (df.bathrooms == 0)]
questionable1
# Dropping all the houses with either bedrooms or bathrooms == 0 since it is
# not really possible. Dropping all matching rows in one vectorized call
# instead of the original one-row-at-a-time loop (each `df.drop` copies the
# whole frame, so the loop was quadratic).
df = df.drop(questionable1.index)
print('Dropped {} rows.'.format(len(questionable1)))
print()
df.info();
# Checking for houses with greater living area than their lot area
questionable2 = df[df['sqft_living'] > df['sqft_lot']]
questionable2
# Dropping all the houses with living area greater than the lot area since it
# is not really possible. Single vectorized drop instead of the original
# per-row loop (each `df.drop` copies the whole frame).
df = df.drop(questionable2.index)
print('Dropped {} rows.'.format(len(questionable2)))
print()
df.info();
# Price distribution, then price against the main numeric features.
sns.distplot(df['price'], bins=100);
fig, axes = plt.subplots(2, 4,figsize=(12,8))
panel_specs = [
    (sns.scatterplot, 'sqft_living'), (sns.scatterplot, 'sqft_lot'),
    (sns.scatterplot, 'sqft_above'), (sns.scatterplot, 'lat'),
    (sns.scatterplot, 'long'), (sns.scatterplot, 'yr_built'),
    (sns.barplot, 'renovated'), (sns.scatterplot, 'age'),
]
for idx, (plot_fn, feature) in enumerate(panel_specs):
    plot_fn(x=feature, y='price', data=df, ax=axes[idx // 4][idx % 4])
plt.tight_layout();
# Price against the ordinal/categorical features.
fig, axes = plt.subplots(2, 2,figsize=(12,8))
for idx, feature in enumerate(['condition', 'waterfront', 'view', 'floors']):
    sns.barplot(x=feature, y='price', data=df, ax=axes[idx // 2][idx % 2])
plt.tight_layout();
# Price vs. each feature, colour-split by waterfront status.
fig, axes = plt.subplots(7, 2,figsize=(13,18))
panel_specs = [
    (sns.scatterplot, 'yr_built'), (sns.barplot, 'renovated'),
    (sns.scatterplot, 'age'), (sns.scatterplot, 'sqft_living'),
    (sns.scatterplot, 'sqft_lot'), (sns.barplot, 'condition'),
    (sns.barplot, 'view'), (sns.barplot, 'grade'),
    (sns.barplot, 'floors'), (sns.scatterplot, 'bathrooms'),
    (sns.scatterplot, 'bedrooms'), (sns.scatterplot, 'zipcode'),
    (sns.scatterplot, 'lat'), (sns.scatterplot, 'long'),
]
for idx, (plot_fn, feature) in enumerate(panel_specs):
    plot_fn(x=feature, y='price', data=df, hue='waterfront', ax=axes[idx // 2][idx % 2])
plt.tight_layout();
# Price vs. each feature, colour-split by view rating.
fig, axes = plt.subplots(7, 2,figsize=(13,18))
panel_specs = [
    (sns.scatterplot, 'yr_built'), (sns.barplot, 'renovated'),
    (sns.scatterplot, 'age'), (sns.scatterplot, 'sqft_living'),
    (sns.scatterplot, 'sqft_lot'), (sns.barplot, 'condition'),
    (sns.barplot, 'waterfront'), (sns.barplot, 'grade'),
    (sns.barplot, 'floors'), (sns.scatterplot, 'bathrooms'),
    (sns.scatterplot, 'bedrooms'), (sns.scatterplot, 'zipcode'),
    (sns.scatterplot, 'lat'), (sns.scatterplot, 'long'),
]
for idx, (plot_fn, feature) in enumerate(panel_specs):
    plot_fn(x=feature, y='price', data=df, hue='view', ax=axes[idx // 2][idx % 2])
plt.tight_layout();
# Price vs. each feature, colour-split by construction grade.
fig, axes = plt.subplots(7, 2,figsize=(13,28))
panel_specs = [
    (sns.scatterplot, 'yr_built'), (sns.barplot, 'renovated'),
    (sns.scatterplot, 'age'), (sns.scatterplot, 'sqft_living'),
    (sns.scatterplot, 'sqft_lot'), (sns.barplot, 'condition'),
    (sns.barplot, 'waterfront'), (sns.barplot, 'view'),
    (sns.barplot, 'floors'), (sns.scatterplot, 'bathrooms'),
    (sns.scatterplot, 'bedrooms'), (sns.scatterplot, 'zipcode'),
    (sns.scatterplot, 'lat'), (sns.scatterplot, 'long'),
]
for idx, (plot_fn, feature) in enumerate(panel_specs):
    plot_fn(x=feature, y='price', data=df, hue='grade', ax=axes[idx // 2][idx % 2])
plt.tight_layout();
# Price vs. each feature, colour-split by condition rating.
fig, axes = plt.subplots(7, 2,figsize=(13,24))
panel_specs = [
    (sns.scatterplot, 'yr_built'), (sns.barplot, 'renovated'),
    (sns.scatterplot, 'age'), (sns.scatterplot, 'sqft_living'),
    (sns.scatterplot, 'sqft_lot'), (sns.barplot, 'grade'),
    (sns.barplot, 'waterfront'), (sns.barplot, 'view'),
    (sns.barplot, 'floors'), (sns.scatterplot, 'bathrooms'),
    (sns.scatterplot, 'bedrooms'), (sns.scatterplot, 'zipcode'),
    (sns.scatterplot, 'lat'), (sns.scatterplot, 'long'),
]
for idx, (plot_fn, feature) in enumerate(panel_specs):
    plot_fn(x=feature, y='price', data=df, hue='condition', ax=axes[idx // 2][idx % 2])
plt.tight_layout();
# Price vs. each feature, colour-split by number of floors.
fig, axes = plt.subplots(7, 2,figsize=(13,24))
panel_specs = [
    (sns.scatterplot, 'yr_built'), (sns.barplot, 'renovated'),
    (sns.scatterplot, 'age'), (sns.scatterplot, 'sqft_living'),
    (sns.scatterplot, 'sqft_lot'), (sns.barplot, 'grade'),
    (sns.barplot, 'waterfront'), (sns.barplot, 'view'),
    (sns.barplot, 'condition'), (sns.scatterplot, 'bathrooms'),
    (sns.scatterplot, 'bedrooms'), (sns.scatterplot, 'zipcode'),
    (sns.scatterplot, 'lat'), (sns.scatterplot, 'long'),
]
for idx, (plot_fn, feature) in enumerate(panel_specs):
    plot_fn(x=feature, y='price', data=df, hue='floors', ax=axes[idx // 2][idx % 2])
plt.tight_layout();
# Histograms of all retained numeric features, 25 bins each.
df1=df[['price', 'bedrooms', 'bathrooms', 'sqft_living',
'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
'sqft_above', 'sqft_basement', 'yr_built', 'renovated', 'zipcode',
'lat', 'long', 'sqft_living15', 'sqft_lot15', 'age']]
h = df1.hist(bins=25,figsize=(12,12),xlabelsize='10',ylabelsize='10')
plt.tight_layout();
# Boxplots for all the continuous numerical features, laid out on a 3x3 grid.
fig, axes = plt.subplots(3, 3, figsize=(12,8));
box_cols = ['price', 'bedrooms', 'bathrooms',
            'sqft_living', 'sqft_lot', 'sqft_above',
            'sqft_basement', 'sqft_living15', 'sqft_lot15']
for idx, col in enumerate(box_cols):
    sns.boxplot(df[col], orient='vertical', ax=axes[idx // 3][idx % 3])
plt.tight_layout()
# Price distribution per level of each ordinal/categorical feature (2x3 grid).
fig, axes = plt.subplots(2, 3,figsize=(13,8))
cat_cols = ['bedrooms', 'floors', 'condition', 'view', 'grade', 'waterfront']
for idx, col in enumerate(cat_cols):
    sns.boxplot(x=df[col], y=df['price'], ax=axes[idx // 3][idx % 3])
plt.tight_layout();
Observations:
# Price distribution per bathroom count.
fig, axes = plt.subplots(1, 1,figsize=(12,5))
sns.boxplot(x=df['bathrooms'],y=df['price'], ax=axes)
# Fix: the original x-label said "Bathrooms or Bedrooms", but only bathrooms
# are plotted on this axis.
axes.set(xlabel='Bathrooms', ylabel='Price');
plt.tight_layout();
Observations:
The Price levels vary significantly with the number of Bedrooms or Bathrooms.
# Price distribution per zipcode — location visibly drives price.
plt.figure(figsize=(18,6))
sns.boxplot(x=df['zipcode'], y=df['price']);
plt.xticks(rotation=90)
# Clip the y-axis so extreme outliers do not flatten the boxes.
plt.ylim(0,6e6)
plt.show()
# 3D scatter plots exploring joint relationships between floors, room counts
# and areas. Fix: two z-labels said "Bathrooms or Bedrooms" although only
# bathrooms are plotted on those axes.
fig=plt.figure(figsize=(13,14))
ax=fig.add_subplot(2,2,1, projection="3d")
ax.scatter(df['floors'],df['bedrooms'],df['bathrooms'],alpha=.5)
ax.set(xlabel='\nFloors',ylabel='\nBedrooms',zlabel='\nBathrooms')
ax.set(ylim=[0,12])
ax=fig.add_subplot(2,2,2, projection="3d")
ax.scatter(df['floors'],df['bedrooms'],df['sqft_living'],alpha=.5)
ax.set(xlabel='\nFloors',ylabel='\nBedrooms',zlabel='\nsqft Living')
ax.set(ylim=[0,12])
ax=fig.add_subplot(2,2,3, projection="3d")
ax.scatter(df['sqft_living'],df['sqft_lot'],df['bathrooms'],alpha=.5)
ax.set(xlabel='\n sqft Living',ylabel='\nsqft Lot',zlabel='\nBathrooms')
ax.set(ylim=[0,250000])
ax=fig.add_subplot(2,2,4, projection="3d")
ax.scatter(df['sqft_living'],df['sqft_lot'],df['bedrooms'], alpha=.5)
ax.set(xlabel='\n sqft Living',ylabel='\nsqft Lot',zlabel='Bedrooms')
ax.set(ylim=[0,250000]);
# Build a modelling copy of the data with age features and their binned forms.
df_dm = df.copy()
# just take the year of sale
df_dm['sales_yr']=df_dm['year']
# add the age of the buildings when the houses were sold as a new column
df_dm['age']=df_dm['sales_yr'].astype(int)-df_dm['yr_built']
# add the age of the renovation when the houses were sold as a new column
# (0 for houses that were never renovated).
# Fix: the original used chained assignment (`df_dm['age_rnv'][...] = 0`),
# which raises SettingWithCopyWarning and can silently fail to write under
# pandas copy-on-write; use .loc with an explicit mask instead.
renovated_mask = df_dm['yr_renovated'] != 0
df_dm['age_rnv'] = 0
df_dm.loc[renovated_mask, 'age_rnv'] = (
    df_dm.loc[renovated_mask, 'sales_yr'].astype(int)
    - df_dm.loc[renovated_mask, 'yr_renovated']
)
# partition the age into bins
bins = [-2,0,5,10,25,50,75,100,100000]
labels = ['l_1','1_5','6_10','11_25','26_50','51_75','76_100','g_100']
df_dm['age_binned'] = pd.cut(df_dm['age'], bins=bins, labels=labels)
# partition the age_rnv into bins
bins = [-2,0,5,10,25,50,75,100000]
labels = ['l_1','1_5','6_10','11_25','26_50','51_75','g_75']
df_dm['age_rnv_binned'] = pd.cut(df_dm['age_rnv'], bins=bins, labels=labels)
# histograms for the binned columns
f, axes = plt.subplots(1, 2,figsize=(10,5))
p1=sns.countplot(df_dm['age_binned'],ax=axes[0])
# Annotate each bar with its count, centred just above the bar top.
for p in p1.patches:
    height = p.get_height()
    p1.text(p.get_x()+p.get_width()/2,height + 50,height,ha="center")
p2=sns.countplot(df_dm['age_rnv_binned'],ax=axes[1])
sns.despine(left=True, bottom=True)
for p in p2.patches:
    height = p.get_height()
    p2.text(p.get_x()+p.get_width()/2,height + 200,height,ha="center")
axes[0].set(xlabel='Age')
axes[0].yaxis.tick_left()
# Mirror the second panel's y-axis to the right-hand side.
axes[1].yaxis.set_label_position("right")
axes[1].yaxis.tick_right()
axes[1].set(xlabel='Renovation Age');
# transform the factor values to be able to use in the model
# (one-hot encode the binned age features for modelling)
df_dm = pd.get_dummies(df_dm, columns=['age_binned','age_rnv_binned'])
plt.tight_layout();
# The Correlation Matrix after pre-processing
plt.figure(figsize=(22,12))
# Mask the upper triangle so each pairwise correlation is shown only once.
mask = np.triu(np.ones_like(df_dm.corr(), dtype=bool))
sns.heatmap(df_dm.corr(), annot=True, vmin=-1,vmax=1,center=0, mask=mask, cmap='RdBu_r', fmt='.2f');
Observations:
# Dropping the features with VIF > 5 since it indicates strong collinearity issues
# NOTE(review): the VIF computation itself is not shown in this file
# (variance_inflation_factor is imported but unused here) — confirm the
# values were computed elsewhere.
df.drop(['sqft_living15', 'sqft_lot15', 'yr_built', 'yr_renovated', 'sqft_above', 'sqft_basement'], axis=1, inplace=True)
# Dropping the features with VIF > 5 since it indicates strong collinearity issues
df_dm.drop(['sqft_living15', 'sqft_lot15', 'yr_built', 'yr_renovated', 'sqft_above',
'sales_yr', 'renovated', 'age_rnv','sqft_basement'], axis=1, inplace=True)
# The Correlation Matrix after removing correlated features
plt.figure(figsize=(20,8))
# Upper-triangle mask again: show each correlation once.
mask = np.triu(np.ones_like(df_dm.corr(), dtype=bool))
sns.heatmap(df_dm.corr(), annot=True, vmin=-1,vmax=1,center=0, mask=mask, cmap='RdBu_r', fmt='.2f');
# Separate the regression target (house price) from the feature matrix.
Y = df_dm.price.values
# Feature frame: everything except the target column.
df1 = df_dm.drop('price', axis=1)
X = df1
# Keep the feature names around for the ranking tables below.
colnames = df1.columns
print(colnames)
print(len(colnames))
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
# Sweep sequential forward selection over 3-20 features and plot
# cross-validated R^2 (with std-dev band) against subset size.
sfs1 = SFS(LinearRegression(),
           k_features= (3, 20),
           forward=True,
           floating=False,
           scoring = 'r2',
           cv = 5,
           n_jobs=-1)
sfs1.fit(X, Y)
fig1 = plot_sfs(sfs1.get_metric_dict(), kind='std_dev')
plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()
# Sequential Forward Selection (SFS): greedily add features until 15 remain,
# scoring by 5-fold cross-validated R^2.
sfs = SFS(LinearRegression(),
          k_features=15,
          forward=True,
          floating=False,
          scoring = 'r2',
          cv = 5,
          n_jobs=-1)
sfs.fit(X, Y)
print('SFS top 15 features:\n',sfs.k_feature_names_) # to get the final set of features
# Sequential Backward Selection (SBS): start from all features, greedily remove.
sbs = SFS(LinearRegression(),
          k_features=15,
          forward=False,
          floating=False,
          scoring = 'r2',
          cv = 5,
          n_jobs=-1)
sbs.fit(X, Y)
print('SBS top 15 features:\n',sbs.k_feature_names_) # to get the final set of features
# Sequential Forward Floating Selection (SFFS): like SFS, but may also drop a
# previously added feature when doing so improves the score (cv=0: no CV here).
sffs = SFS(LinearRegression(),
           k_features=15,
           forward=True,
           floating=True,
           cv=0)
sffs.fit(X, Y)
print('SFFS top 15 features:\n',sffs.k_feature_names_) # to get the final set of features
# Exhaustive Feature Selection (EFS) — left disabled: it evaluates 30826
# feature combinations from 10 features, which is computationally very expensive.
# emodel = efs(LinearRegression(), scoring='r2', n_jobs=-1, min_features=1, max_features=10, cv=5)
# emodel.fit(X, Y)
# Fix: the follow-up lookup referenced `emodel`, which is never created because
# the fit above is commented out (and it also misspelled `.fit` as `.fits`),
# so it raised NameError at runtime; it is disabled consistently here.
# X.columns[list(emodel.best_idx_)]
# Dictionary mapping method name -> {feature: normalized score}.
ranks = {}

def ranking(ranks, names, order=1):
    """Min-max scale a score vector to [0, 1] and pair feature names with rounded scores.

    `order=-1` inverts the scores first, for methods where lower raw values
    mean better features (e.g. RFE rankings).
    """
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(order*np.array([ranks]).T).T[0]
    return {feat: round(score, 2) for feat, score in zip(names, scaled)}
# Rank features by absolute coefficient / importance under several models.
# Using Linear Regression
# NOTE(review): `normalize=True` was deprecated in sklearn 1.0 and removed in
# 1.2 — switch to a StandardScaler pipeline when upgrading.
lr = LinearRegression(normalize=True)
lr.fit(X,Y)
ranks["LinReg"] = ranking(np.abs(lr.coef_), colnames)
# Recursive Feature Elimination
# order=-1 because a *lower* RFE ranking number means a better feature.
rfe = RFE(lr, n_features_to_select=1, verbose =3 )
rfe.fit(X,Y)
ranks["RFE"] = ranking(list(map(float, rfe.ranking_)), colnames, order=-1)
# Using Ridge
ridge = Ridge(alpha = 7)
ridge.fit(X,Y)
ranks['Ridge'] = ranking(np.abs(ridge.coef_), colnames)
# Using Lasso
lasso = Lasso(alpha=.05)
lasso.fit(X, Y)
ranks["Lasso"] = ranking(np.abs(lasso.coef_), colnames)
# Using Random Forest Regressor
rf = RandomForestRegressor(n_jobs=-1, n_estimators=50, verbose=3)
rf.fit(X,Y)
ranks["RF"] = ranking(rf.feature_importances_, colnames);
# Create empty dictionary to store the mean value calculated from all the scores
r = {}
# Average each feature's normalized score across all ranking methods.
for name in colnames:
    r[name] = round(np.mean([ranks[method][name]
                    for method in ranks.keys()]), 2)
methods = sorted(ranks.keys())
ranks["Mean"] = r
methods.append("Mean")
# Print a tab-separated table: one row per feature, one column per method.
print("\t%s" % "\t".join(methods))
for name in colnames:
    print("%s\t%s" % (name, "\t".join(map(str,
          [ranks[method][name] for method in methods]))))
# Put the mean scores into a Pandas dataframe
meanplot = pd.DataFrame(list(r.items()), columns= ['Feature','Mean Ranking'])
# Sort the dataframe
meanplot = meanplot.sort_values('Mean Ranking', ascending=False)
# Let's plot the ranking of the features (horizontal bars, best first)
sns.catplot(x="Mean Ranking", y="Feature", data = meanplot, kind="bar", height=6, aspect=1.5, palette='magma');
# Top 15/29 features from Feature Selection
l = ['bedrooms', 'bathrooms', 'sqft_living',
     'waterfront', 'view', 'condition', 'grade',
     'zipcode', 'lat', 'long', 'year', 'age',
     'age_binned_11_25', 'age_binned_26_50',
     'age_rnv_binned_1_5']
X = df_dm[l]
y = df_dm['price']
# Hold out 25% of the data for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25, random_state=3)
# Print the features joined by '+' (handy for building the OLS formula below).
for i in l:
    print(i,'+',end=' ')
# Using OLS Method
# Fit a statsmodels OLS on the selected non-dummy features for a full
# inferential summary (coefficients, p-values, R^2). Note this uses `df`,
# which still contains these columns, not the dummy-encoded `df_dm`.
model_multilm = ols('price ~ bedrooms + bathrooms + sqft_living + waterfront + view + condition + grade + zipcode + lat + long + year + age ', data=df)
result_ml = model_multilm.fit()
result_ml.summary()
# Multiple Linear Regression: fit, predict on train/test, report metrics.
lrmodel = LinearRegression();
lrmodel.fit(X_train, y_train);
y_predtr = lrmodel.predict(X_train)
y_predts = lrmodel.predict(X_test)
model_scoretr = lrmodel.score(X_train, y_train)
model_scorets = lrmodel.score(X_test,y_test)
r_squaretr = metrics.r2_score(y_train, y_predtr)
r_squarets = metrics.r2_score(y_test, y_predts)
msetr = metrics.mean_squared_error(y_train, y_predtr)
msets = metrics.mean_squared_error(y_test, y_predts)
# Note: these are *median* absolute errors, per the printed labels below.
maetr = metrics.median_absolute_error(y_train, y_predtr)
maets = metrics.median_absolute_error(y_test, y_predts)
print('The Report for Multiple Linear Regression Model is: \n')
print("R-Square (Train) : ", r_squaretr)
print("R-Square (Test) : ", r_squarets)
print()
print("Mean Squared Error (Train) : ", msetr)
print("Mean Squared Error (Test) : ", msets)
print()
print("Root Mean Squared Error (Train) : ", msetr**(1/2))
print("Root Mean Squared Error (Test) : ", msets**(1/2))
print()
print("Median Absolute Error (Train) : ", maetr)
print("Median Absolute Error (Test) : ", maets)
# A close to normal distribution of Residuals indicates correctness of the model
sns.distplot((y_test-y_predts), bins=50);
plt.xlim(-2000000, 2000000);
plt.show()
# Random Forest Regressor: fit, predict on train/test, report metrics.
rand_regr = RandomForestRegressor(n_estimators=400,random_state=3)
rand_regr.fit(X_train, y_train)
y_predtr = rand_regr.predict(X_train)
y_predts = rand_regr.predict(X_test)
r_squaretr = rand_regr.score(X_train, y_train)
r_squarets = rand_regr.score(X_test,y_test)
msetr = metrics.mean_squared_error(y_train, y_predtr)
msets = metrics.mean_squared_error(y_test, y_predts)
maetr = metrics.median_absolute_error(y_train, y_predtr)
maets = metrics.median_absolute_error(y_test, y_predts)
print('The Report for Random Forest Model is: \n')
print("R-Square (Train) : ", r_squaretr)
print("R-Square (Test) : ", r_squarets)
print()
print("Mean Squared Error (Train) : ", msetr)
print("Mean Squared Error (Test) : ", msets)
print()
print("Root Mean Squared Error (Train) : ", msetr**(1/2))
print("Root Mean Squared Error (Test) : ", msets**(1/2))
print()
print("Median Absolute Error (Train) : ", maetr)
print("Median Absolute Error (Test) : ", maets)
# XGBoost Regressor with default hyperparameters: fit and report metrics.
xg = XGBRegressor()
xg.fit(X_train,y_train)
y_predtr = xg.predict(X_train)
y_predts = xg.predict(X_test)
r_squaretr = xg.score(X_train, y_train)
r_squarets = xg.score(X_test,y_test)
msetr = metrics.mean_squared_error(y_train, y_predtr)
msets = metrics.mean_squared_error(y_test, y_predts)
maetr = metrics.median_absolute_error(y_train, y_predtr)
maets = metrics.median_absolute_error(y_test, y_predts)
print('The Report for XGBoost Regression is: \n')
print("R-Square (Train) : ", r_squaretr)
print("R-Square (Test) : ", r_squarets)
print()
print("Mean Squared Error (Train) : ", msetr)
print("Mean Squared Error (Test) : ", msets)
print()
print("Root Mean Squared Error (Train) : ", msetr**(1/2))
print("Root Mean Squared Error (Test) : ", msets**(1/2))
print()
print("Median Absolute Error (Train) : ", maetr)
print("Median Absolute Error (Test) : ", maets)
# Lasso on the 15 selected features; inspect which coefficients survive.
# NOTE(review): `normalize=True` was deprecated in sklearn 1.0 and removed in 1.2.
lasso = Lasso(alpha=0.2, normalize=True)
# Fit the regressor to the data
lasso.fit(X,y)
# Compute and print the coefficients
lasso_coef = lasso.coef_
print(lasso_coef)
# Plot the coefficients against the features actually used for this fit.
# Fix: the original reused `colnames`, which still held the full feature list
# from the earlier ranking step, so its length (29) did not match the 15
# coefficients and plt.plot raised a shape-mismatch ValueError.
feat_names = X.columns
plt.plot(range(len(feat_names)), lasso_coef)
plt.xticks(range(len(feat_names)), feat_names.values, rotation=60)
plt.margins(0.02)
plt.show()
# Lasso with default alpha on the train split: report metrics, then plot coefficients.
lf = Lasso()
lf.fit(X_train,y_train)
print('The Report for Lasso Regression is: \n')
y_predtr = lf.predict(X_train)
y_predts = lf.predict(X_test)
r_squaretr = lf.score(X_train, y_train)
r_squarets = lf.score(X_test,y_test)
msetr = metrics.mean_squared_error(y_train, y_predtr)
msets = metrics.mean_squared_error(y_test, y_predts)
maetr = metrics.median_absolute_error(y_train, y_predtr)
maets = metrics.median_absolute_error(y_test, y_predts)
print("R-Square (Train) : ", r_squaretr)
print("R-Square (Test) : ", r_squarets)
print()
print("Mean Squared Error (Train) : ", msetr)
print("Mean Squared Error (Test) : ", msets)
print()
print("Root Mean Squared Error (Train) : ", msetr**(1/2))
print("Root Mean Squared Error (Test) : ", msets**(1/2))
print()
print("Median Absolute Error (Train) : ", maetr)
print("Median Absolute Error (Test) : ", maets)
print()
# Compute and print the coefficients
lf_coef = lf.coef_
print(lf_coef)
# Refresh colnames to the 15 features actually used for this fit.
colnames = X_test.columns
# Plot the coefficients
plt.plot(range(len(colnames)), lf_coef)
plt.xticks(range(len(colnames)), colnames.values, rotation=20)
plt.margins(0.02)
plt.show()
# Gradient Boosting Regressor: fit and report metrics.
# NOTE(review): loss='ls' was renamed to 'squared_error' in sklearn >= 1.2.
est = GradientBoostingRegressor(n_estimators=400, max_depth=5,
                                loss='ls',min_samples_split=2,learning_rate=0.1)
est.fit(X_train, y_train)
y_predtr = est.predict(X_train)
y_predts = est.predict(X_test)
r_squaretr = est.score(X_train, y_train)
r_squarets = est.score(X_test,y_test)
msetr = metrics.mean_squared_error(y_train, y_predtr)
msets = metrics.mean_squared_error(y_test, y_predts)
maetr = metrics.median_absolute_error(y_train, y_predtr)
maets = metrics.median_absolute_error(y_test, y_predts)
print('The Report for Gradient Boosting Regression is: \n')
print("R-Square (Train) : ", r_squaretr)
print("R-Square (Test) : ", r_squarets)
print()
print("Mean Squared Error (Train) : ", msetr)
print("Mean Squared Error (Test) : ", msets)
print()
print("Root Mean Squared Error (Train) : ", msetr**(1/2))
print("Root Mean Squared Error (Test) : ", msets**(1/2))
print()
print("Median Absolute Error (Train) : ", maetr)
print("Median Absolute Error (Test) : ", maets)
# AdaBoost Regressor with exponential loss: fit and report metrics.
ada = AdaBoostRegressor(n_estimators=50,
                        learning_rate=0.2,loss='exponential')
ada.fit(X_train, y_train)
y_predtr = ada.predict(X_train)
y_predts = ada.predict(X_test)
r_squaretr = ada.score(X_train, y_train)
r_squarets = ada.score(X_test,y_test)
msetr = metrics.mean_squared_error(y_train, y_predtr)
msets = metrics.mean_squared_error(y_test, y_predts)
maetr = metrics.median_absolute_error(y_train, y_predtr)
maets = metrics.median_absolute_error(y_test, y_predts)
print('The Report for Ada Boost Regressor is: \n')
print("R-Square (Train) : ", r_squaretr)
print("R-Square (Test) : ", r_squarets)
print()
print("Mean Squared Error (Train) : ", msetr)
print("Mean Squared Error (Test) : ", msets)
print()
print("Root Mean Squared Error (Train) : ", msetr**(1/2))
print("Root Mean Squared Error (Test) : ", msets**(1/2))
print()
print("Median Absolute Error (Train) : ", maetr)
print("Median Absolute Error (Test) : ", maets)
# Ridge Regression (L2 regularization, alpha=1): fit and report metrics.
rd = Ridge(alpha=1)
rd.fit(X_train, y_train)
y_predtr = rd.predict(X_train)
y_predts = rd.predict(X_test)
r_squaretr = rd.score(X_train, y_train)
r_squarets = rd.score(X_test,y_test)
msetr = metrics.mean_squared_error(y_train, y_predtr)
msets = metrics.mean_squared_error(y_test, y_predts)
maetr = metrics.median_absolute_error(y_train, y_predtr)
maets = metrics.median_absolute_error(y_test, y_predts)
print('The Report for Ridge Regression is: \n')
print("R-Square (Train) : ", r_squaretr)
print("R-Square (Test) : ", r_squarets)
print()
print("Mean Squared Error (Train) : ", msetr)
print("Mean Squared Error (Test) : ", msets)
print()
print("Root Mean Squared Error (Train) : ", msetr**(1/2))
print("Root Mean Squared Error (Test) : ", msets**(1/2))
print()
print("Median Absolute Error (Train) : ", maetr)
print("Median Absolute Error (Test) : ", maets)
# K-Nearest-Neighbors Regressor (k=15): fit and report metrics.
knnreg = KNeighborsRegressor(n_neighbors=15)
knnreg.fit(X_train, y_train)
print('The Report for KNN Regression is: \n')
y_predtr = knnreg.predict(X_train)
y_predts = knnreg.predict(X_test)
r_squaretr = knnreg.score(X_train, y_train)
r_squarets = knnreg.score(X_test,y_test)
msetr = metrics.mean_squared_error(y_train, y_predtr)
msets = metrics.mean_squared_error(y_test, y_predts)
maetr = metrics.median_absolute_error(y_train, y_predtr)
maets = metrics.median_absolute_error(y_test, y_predts)
print("R-Square (Train) : ", r_squaretr)
print("R-Square (Test) : ", r_squarets)
print()
print("Mean Squared Error (Train) : ", msetr)
print("Mean Squared Error (Test) : ", msets)
print()
print("Root Mean Squared Error (Train) : ", msetr**(1/2))
print("Root Mean Squared Error (Test) : ", msets**(1/2))
print()
print("Median Absolute Error (Train) : ", maetr)
print("Median Absolute Error (Test) : ", maets)